# -*- coding: utf-8 -*-

"""
This module contains various text-comparison algorithms
designed to compare one statement to another.

这个模块包含各种文本比较算法。
设计用来比较一个语句到另一个语句。
"""


class Comparator:

    def __call__(self, statement_a, statement_b):
        return self.compare(statement_a, statement_b)

    def compare(self, statement_a, statement_b):
        """
        Compare two statements and return a similarity value.

        Subclasses override this method; the base implementation
        reports no similarity at all.
        """
        return 0

    def get_initialization_functions(self):
        """
        Return a dictionary of all initialization methods for the
        comparison algorithm, keyed by method name.

        Initialization methods must start with 'initialize_' and
        take no parameters.
        """
        return {
            attribute: getattr(self, attribute)
            for attribute in dir(self)
            if attribute.startswith('initialize_')
        }


class LevenshteinDistance(Comparator):
    """
    Compare two statements based on the Levenshtein distance
    of each statement's text.

    For example, there is a 65% similarity between the statements
    "where is the post office?" and "looking for the post office"
    based on the Levenshtein distance algorithm.
    """

    def compare(self, statement, other_statement):
        """
        Compare the two input statements.

        :return: The percent of similarity between the text of the statements.
        :rtype: float
        """
        import sys

        # Prefer the C-accelerated python-Levenshtein matcher when it
        # is installed, otherwise fall back to the standard library
        try:
            from Levenshtein.StringMatcher import StringMatcher as SequenceMatcher
        except ImportError:
            from difflib import SequenceMatcher

        # A statement with no text cannot match anything
        if not statement.text or not other_statement.text:
            return 0

        # Normalize case, coercing to unicode text under Python 2
        if sys.version_info[0] < 3:
            text_a = unicode(statement.text.lower()) # NOQA
            text_b = unicode(other_statement.text.lower()) # NOQA
        else:
            text_a = str(statement.text.lower())
            text_b = str(other_statement.text.lower())

        matcher = SequenceMatcher(None, text_a, text_b)

        # Express the similarity as a ratio rounded to two decimal places
        return round(matcher.ratio(), 2)



class SynsetDistance(Comparator):
    """
    Calculate the similarity of two statements.
    This is based on the total maximum synset similarity between each word in each sentence.

    This algorithm uses the `wordnet`_ functionality of `NLTK`_ to determine the similarity
    of two statements based on the path similarity between each token of each statement.
    This is essentially an evaluation of the closeness of synonyms.
    """

    def initialize_nltk_wordnet(self):
        """
        Download the required NLTK wordnet corpus if it has
        not already been downloaded.
        """
        from .utils import nltk_download_corpus

        nltk_download_corpus('corpora/wordnet')

    def initialize_nltk_punkt(self):
        """
        Download the required NLTK punkt tokenizer if it has
        not already been downloaded.
        """
        from .utils import nltk_download_corpus

        nltk_download_corpus('tokenizers/punkt')

    def compare(self, statement, other_statement):
        """
        Compare the two input statements.

        :return: The percent of similarity between the closest synset distance.
        :rtype: float

        .. _wordnet: http://www.nltk.org/howto/wordnet.html
        .. _NLTK: http://www.nltk.org/
        """
        from nltk.corpus import wordnet
        from nltk import word_tokenize
        from chatterbot import utils
        import itertools

        # Tokenize both statements and discard English stop words
        words_a = utils.remove_stopwords(
            word_tokenize(statement.text.lower()), language='english'
        )
        words_b = utils.remove_stopwords(
            word_tokenize(other_statement.text.lower()), language='english'
        )

        # The maximum possible similarity is an exact match.
        # Because path_similarity returns a value between 0 and 1,
        # the best possible score is the number of words in the longer
        # of the two input statements.
        best_possible = max(
            len(statement.text.split()),
            len(other_statement.text.split())
        )

        best_found = 0.0

        # Track the single highest path similarity across every pairing
        # of words, comparing each synset of one word against each
        # synset of the other
        for word_a, word_b in itertools.product(words_a, words_b):
            for synset_a, synset_b in itertools.product(
                wordnet.synsets(word_a), wordnet.synsets(word_b)
            ):
                score = synset_a.path_similarity(synset_b)

                if score and score > best_found:
                    best_found = score

        if best_possible == 0:
            return 0

        return best_found / best_possible


class SentimentComparison(Comparator):
    """
    Calculate the similarity of two statements based on the closeness of
    the sentiment value calculated for each statement.
    """

    def initialize_nltk_vader_lexicon(self):
        """
        Download the NLTK vader lexicon for sentiment analysis
        that is required for this algorithm to run.
        """
        from .utils import nltk_download_corpus

        nltk_download_corpus('sentiment/vader_lexicon')

    def compare(self, statement, other_statement):
        """
        Return the similarity of two statements based on
        their calculated sentiment values.

        :return: The percent of similarity between the sentiment value.
        :rtype: float
        """
        from nltk.sentiment.vader import SentimentIntensityAnalyzer

        analyzer = SentimentIntensityAnalyzer()

        scores_a = analyzer.polarity_scores(statement.text.lower())
        scores_b = analyzer.polarity_scores(other_statement.text.lower())

        # Identify the dominant polarity for each statement. Iterating
        # the keys in sorted order makes tie-breaking deterministic.
        polarity_a = max(sorted(scores_a), key=scores_a.get)
        polarity_b = max(sorted(scores_b), key=scores_b.get)

        # Statements whose dominant polarities differ do not match
        if polarity_a != polarity_b:
            return 0

        # The closer the two dominant scores, the higher the similarity
        difference = abs(scores_a[polarity_a] - scores_b[polarity_b])

        return 1.0 - difference


class JaccardSimilarity(Comparator):
    """
    Calculates the similarity of two statements based on the Jaccard index.

    The Jaccard index is composed of a numerator and denominator.
    In the numerator, we count the number of items that are shared between the sets.
    In the denominator, we count the total number of items across both sets.
    Let's say we define sentences to be equivalent if 50% or more of their tokens are equivalent.
    Here are two sample sentences:

        The young cat is hungry.
        The cat is very hungry.

    When we parse these sentences to remove stopwords, we end up with the following two sets:

        {young, cat, hungry}
        {cat, very, hungry}

    In our example above, our intersection is {cat, hungry}, which has count of two.
    The union of the sets is {young, cat, very, hungry}, which has a count of four.
    Therefore, our `Jaccard similarity index`_ is two divided by four, or 50%.
    Given our similarity threshold above, we would consider this to be a match.

    .. _`Jaccard similarity index`: https://en.wikipedia.org/wiki/Jaccard_index
    """

    # Minimum Jaccard index for two statements to be considered a match
    SIMILARITY_THRESHOLD = 0.5

    def initialize_nltk_wordnet(self):
        """
        Download the NLTK wordnet corpora that is required for this algorithm
        to run only if the corpora has not already been downloaded.
        """
        from .utils import nltk_download_corpus

        nltk_download_corpus('corpora/wordnet')

    def compare(self, statement, other_statement):
        """
        Compare two statements using the Jaccard index of their
        lemmatized noun tokens.

        :return: True if the Jaccard index of the two statements
            meets or exceeds ``SIMILARITY_THRESHOLD``.
        :rtype: bool

        NOTE(review): unlike the other comparators in this module,
        this method returns a boolean rather than a float similarity
        value; callers relying on that are unaffected by this change.
        """
        from nltk.corpus import wordnet
        import nltk
        import string

        # Get default English stopwords and extend with punctuation
        stopwords = nltk.corpus.stopwords.words('english')
        stopwords.extend(string.punctuation)
        stopwords.append('')

        # Groups inflected and variant forms of a word under its lemma
        lemmatizer = nltk.stem.wordnet.WordNetLemmatizer()

        def get_wordnet_pos(pos_tag):
            """
            Map a Penn Treebank part-of-speech tag to the corresponding
            wordnet constant, defaulting to noun.
            """
            tag_map = {
                'J': wordnet.ADJ,
                'V': wordnet.VERB,
                'N': wordnet.NOUN,
                'R': wordnet.ADV,
            }
            return (pos_tag[0], tag_map.get(pos_tag[1][:1], wordnet.NOUN))

        def get_noun_lemmas(text):
            """
            Tokenize, POS-tag and lemmatize the text, keeping only noun
            lemmas that are not stop words.
            """
            tagged = map(get_wordnet_pos, nltk.pos_tag(nltk.tokenize.word_tokenize(text)))
            return [
                lemmatizer.lemmatize(token.strip(string.punctuation), pos)
                for token, pos in tagged
                if pos == wordnet.NOUN and token.strip(string.punctuation) not in stopwords
            ]

        lemma_a = get_noun_lemmas(statement.text.lower())
        lemma_b = get_noun_lemmas(other_statement.text.lower())

        # Calculate the Jaccard similarity. An empty union (no usable
        # tokens in either statement) previously raised ZeroDivisionError
        # that was swallowed by a broad except that printed to stdout;
        # treat it explicitly as zero similarity instead.
        union = set(lemma_a).union(lemma_b)
        if union:
            ratio = len(set(lemma_a).intersection(lemma_b)) / float(len(union))
        else:
            ratio = 0

        return ratio >= self.SIMILARITY_THRESHOLD


# ---------------------------------------- #


# Shared module-level instances of each comparison algorithm, so that
# callers can reference a ready-made comparator without constructing one.
levenshtein_distance = LevenshteinDistance()
synset_distance = SynsetDistance()
sentiment_comparison = SentimentComparison()
jaccard_similarity = JaccardSimilarity()
